{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "88716015",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6dc057d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/ms-en.subwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ea89d350",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-04 23:01:32.983758: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n",
      "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from malaya.text.t2t import text_encoder\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "73a93be0",
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = text_encoder.SubwordTextEncoder('ms-en.subwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b11e900",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.decode([25891])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "87943d78",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[197]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.encode('saya')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6bde4831",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25880"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "49a59f3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !~/tf-nvidia/bin/pip3 install fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "80e9cf0c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "38c3e1ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['malay', 'eng']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict(['saya suka', 'i like'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d7bc9727",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Encoder:\n",
    "    def __init__(self, encoder):\n",
    "        self.encoder = encoder\n",
    "        self.vocab_size = encoder.vocab_size\n",
    "\n",
    "    def encode(self, s):\n",
    "        s = [self.encoder.encode(s_) for s_ in s]\n",
    "        s = [i + [1] for i in s]\n",
    "        return s\n",
    "\n",
    "    def decode(self, ids, strip_extraneous = False):\n",
    "        return self.encoder.decode(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9ba85517",
   "metadata": {},
   "outputs": [],
   "source": [
    "s_encoder = Encoder(encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "629e6e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.data_generators import problem\n",
    "from tensor2tensor.data_generators import text_problems\n",
    "from tensor2tensor.utils import registry\n",
    "from tqdm import tqdm\n",
    "from glob import glob\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "70526163",
   "metadata": {},
   "outputs": [],
   "source": [
    "@registry.register_problem\n",
    "class Translation(text_problems.Text2TextProblem):\n",
    "    @property\n",
    "    def approx_vocab_size(self):\n",
    "        return encoder.vocab_size\n",
    "\n",
    "    @property\n",
    "    def is_generate_per_split(self):\n",
    "        # generate_data will shard the data into TRAIN and EVAL for us.\n",
    "        return False\n",
    "\n",
    "    @property\n",
    "    def dataset_splits(self):\n",
    "        return [\n",
    "            {'split': problem.DatasetSplit.EVAL, 'shards': 1},\n",
    "        ]\n",
    "\n",
    "    def generate_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "                \n",
    "        with open('test/left.txt') as fopen:\n",
    "            left = fopen.read().split('\\n')\n",
    "        \n",
    "        with open('test/right.txt') as fopen:\n",
    "            right = fopen.read().split('\\n')\n",
    "            \n",
    "        for i in tqdm(range(len(left))):\n",
    "            if len(left[i]) and len(right[i]):\n",
    "                lang_left, lang_right = fast_text.predict([left[i], right[i]])\n",
    "                if lang_left not in ['malay', 'eng']:\n",
    "                    continue\n",
    "                if lang_right not in ['eng']:\n",
    "                    continue\n",
    "                i, o = s_encoder.encode([left[i], right[i]])\n",
    "                yield {'inputs': i, 'targets': o}\n",
    "                \n",
    "        with open('augmented-ms-en-test.json') as fopen:\n",
    "            data = json.load(fopen)\n",
    "\n",
    "        for i in tqdm(range(len(data['ms']))):\n",
    "            if len(data['ms'][i]) and len(data['en'][i]):\n",
    "                i, o = s_encoder.encode([data['ms'][i], data['en'][i]])\n",
    "                yield {'inputs': i, 'targets': o}\n",
    "\n",
    "    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "\n",
    "        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)\n",
    "        for sample in generator:\n",
    "            yield sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c74aa440",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "160edb4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = os.path.expanduser('t2t-noisy-ms-en/data')\n",
    "TMP_DIR = os.path.expanduser('t2t-noisy-ms-en/tmp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a420c0d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /tmp/ipykernel_27856/2493044156.py:1: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tf.gfile.MakeDirs(DATA_DIR)\n",
    "tf.gfile.MakeDirs(TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "deaf62c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.utils import registry\n",
    "from tensor2tensor import problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e02f52a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|                                                                                                     | 0/100000 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:15<00:00, 6536.22it/s]\n",
      " 29%|█████████████████████████▎                                                             | 19118/65642 [00:01<00:03, 13665.07it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 65642/65642 [00:04<00:00, 13209.64it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generated 145209 Examples\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "INFO:tensorflow:Generated 145209 Examples\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:read: 100000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:read: 100000\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:write: 100000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:write: 100000\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    }
   ],
   "source": [
    "PROBLEM = 'translation'\n",
    "t2t_problem = problems.problem(PROBLEM)\n",
    "t2t_problem.generate_data(DATA_DIR, TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54985a1f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf1",
   "language": "python",
   "name": "tf1"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
